top_coef_plot <- function(final_model, model_name, ds_name, enet = FALSE,
                          reactome = FALSE, reactome_info = NULL) {
  # Bar plot of the 5 lowest and 5 highest regression coefficients of a
  # fitted model, rescaled so the absolute coefficients sum to 100.
  #
  # final_model   fitted final model (e.g. caret model_fit$finalModel)
  # model_name    model label used in the plot title
  # ds_name       dataset label used in the plot title
  # enet          TRUE when final_model is a glmnet fit, whose coefficients
  #               must be extracted at the tuned lambda
  # reactome      TRUE to translate 'reactome_*' feature IDs to pathway names
  # reactome_info data.frame with columns path_ID and path_names; required
  #               when reactome = TRUE
  #
  # Returns a ggplot object (horizontal bar chart).
  if (enet) {
    # glmnet: extract coefficients at the lambda chosen during tuning.
    # caret stores bestTune$lambda on the final model as lambdaOpt — TODO
    # confirm for the caret version in use. (The original code referenced an
    # out-of-scope 'model_ls' here and the result was immediately
    # overwritten, so this branch never worked.)
    cmat <- as.matrix(coef(final_model, s = final_model$lambdaOpt))
    coefficients <- setNames(cmat[, 1], rownames(cmat))
  } else {
    # PLS (and similar): coef() returns a p x 1 x 1 array; take the slice
    # as a named numeric vector.
    coefficients <- coef(final_model)[, 1, 1]
  }
  # Normalize so that sum(|coef|) == 100, then sort ascending.
  coefficients <- coefficients * 100 / sum(abs(coefficients))
  coefficients <- sort(coefficients)
  # NOTE(review): after an ascending sort head() holds the most NEGATIVE
  # coefficients yet is labelled 'pos_pred' (and tail() 'neg_pred'). Labels
  # are kept exactly as in the original — confirm whether they should be
  # swapped.
  rc_pos <- data.frame(c(head(coefficients, 5)))
  colnames(rc_pos) <- c('pos_pred')
  rc_pos$ID <- rownames(rc_pos)
  rc_pos <- reshape2::melt(rc_pos)
  rc_neg <- data.frame(c(tail(coefficients, 5)))
  colnames(rc_neg) <- c('neg_pred')
  rc_neg$ID <- rownames(rc_neg)
  rc_neg <- reshape2::melt(rc_neg)
  rc <- rbind(rc_pos, rc_neg)
  # Echo which selected features are REACTOME pathways (diagnostic output).
  print(rc$ID[grep('reactome', rc$ID)])
  if (reactome) {
    # Replace REACTOME IDs with human-readable pathway names. match() keeps
    # the result aligned with the row order of rc; the original
    # which(path_ID %in% ...) returned indices in reactome_info order and
    # could assign names to the wrong rows.
    reactome_rows <- grep('reactome', rc$ID)
    k <- match(gsub('rna_', '', rc$ID[reactome_rows]), reactome_info$path_ID)
    rc$ID[reactome_rows] <- reactome_info$path_names[k]
  }
  # Horizontal bar chart, bars ordered by coefficient value.
  p <- ggplot(data = rc, aes(reorder(ID, value), value, fill = variable)) +
    geom_bar(stat = "identity") +
    theme_minimal() +
    ggtitle(paste(model_name, ' model with ', ds_name)) +
    xlab("Variable") +
    ylab("Regression coefficient") +
    coord_flip() +
    theme(legend.position = "none", plot.title = element_text(hjust = 0.5))
  p
}
ftimp_gg <- function(model_ls, n_topft = 10, model_name, ds_name,
                     reactome = FALSE, reactome_info = NULL) {
  # Plot caret variable importance (top n_topft features by 'Overall') and
  # iml feature-effect curves for a fitted model.
  #
  # model_ls      list with model_fit (caret train object) and TrainSet
  #               (data.frame containing the response column SILA)
  # n_topft       number of top features to display
  # model_name    model label for the plot title
  # ds_name       dataset label for the plot title
  # reactome      TRUE to translate 'reactome_*' feature IDs to pathway names
  # reactome_info data.frame with path_ID / path_names; required when
  #               reactome = TRUE
  #
  # Side effects: prints two plots (importance bar chart, feature effects).
  library('ggplot2')  # library() errors on a missing package; require() only warns
  library('iml')
  # Feature importance. Capture feature names BEFORE arranging: dplyr's
  # arrange() silently resets data.frame row names, so the original
  # mutate(Features = rownames(.)) after the pipe was unreliable.
  ftimp_rf <- caret::varImp(model_ls$model_fit)$importance
  ftimp_rf$Features <- rownames(ftimp_rf)
  ftimp_rf <- ftimp_rf %>%
    dplyr::arrange(dplyr::desc(Overall)) %>%
    dplyr::top_n(n_topft, Overall)  # explicit weight; was implicit "Selecting by Overall"
  # Keep original feature IDs for the feature-effects plot: the predictor
  # data still uses them even after renaming below for display.
  top_fts <- ftimp_rf$Features
  if (reactome) {
    # Replace REACTOME IDs with pathway names, row-order-preserving via
    # match() (which(%in%) could misalign names — see top_coef_plot).
    reactome_rows <- grep('reactome', ftimp_rf$Features)
    k <- match(gsub('rna_', '', ftimp_rf$Features[reactome_rows]),
               reactome_info$path_ID)
    ftimp_rf$Features[reactome_rows] <- reactome_info$path_names[k]
  }
  print(ggplot(ftimp_rf, aes(x = reorder(Features, Overall), y = Overall)) +
    geom_bar(stat = "identity", fill = 'steelblue') +
    labs(x = "Features", y = "Importance %") +
    coord_flip() +
    theme_minimal() +
    ggtitle(paste(model_name, ' model with ', ds_name)) +
    theme(legend.position = "none", plot.title = element_text(hjust = 0.5)))
  # Feature effects on the training data (response SILA excluded from X).
  X <- model_ls$TrainSet[which(names(model_ls$TrainSet) != "SILA")]
  y <- model_ls$TrainSet$SILA
  predictor <- Predictor$new(model_ls$model_fit, data = X, y = y)
  print(plot(FeatureEffects$new(predictor, features = top_fts)))
}
frac_dif <- function(model_ls) {
  # Percent difference between predictions and observed SILA scores on the
  # test set: 100 * (pred / observed - 1).
  #
  # model_ls  list with 'pred' (numeric predictions) and 'TestSet'
  #           (data.frame containing the observed SILA column)
  #
  # Returns a numeric vector, one value per test sample. (The original
  # assigned to a local shadowing the function name, which made the value
  # return invisibly; returning the expression directly is clearer.)
  ((model_ls$pred / model_ls$TestSet$SILA) - 1) * 100
}
# Accumulator for per-model test errors; one row is appended per evaluated
# model below. NOTE(review): assigning rows via c() with a character model
# name coerces RMSE/MAE to character — they are converted back to numeric
# near the end of the script before printing the table.
err_df <- data.frame('Model'=NA, 'RMSE'=NA, 'MAE'=NA)
# REACTOME pathway ID -> pathway name lookup used by the plotting helpers.
# NOTE(review): absolute, user-specific path — breaks on other machines.
reactome_info <- read.csv('/Users/senosam/Documents/Massion_lab/data_integration/models/data/rna_reactomeinfo.csv', row.names = 1)
# ---- CyTOF frequencies: PLS ----
ds_name = 'CyTOF frequencies'
model_ds <- models_by_ds$cytof_freq
model_ls <- model_ds$model_PLS
model_name = 'PLS'
# Held-out test-set error metrics.
cat('RMSE test: ', model_ls$rmse_test, '\n')
## RMSE test: 0.1472328
cat('MAE test: ', MAE(model_ls$pred, model_ls$TestSet$SILA), '\n')
## MAE test: 0.1394869
# Print the caret training summary (knitted output follows).
model_ls$model_fit
## Partial Least Squares
##
## 34 samples
## 39 predictors
##
## Pre-processing: centered (39), scaled (39)
## Resampling: Cross-Validated (5 fold, repeated 10 times)
## Summary of sample sizes: 28, 28, 27, 26, 27, 29, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 0.1956908 0.1483690 0.1563369
## 2 0.2302795 0.1324251 0.1852570
## 3 0.2351941 0.1341671 0.1906296
## 4 0.2751348 0.1689894 0.2165516
## 5 0.3261610 0.1985935 0.2504491
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 1.
# Tuning-profile plot (RMSE vs number of PLS components).
plot(model_ls$model_fit)
err_df[1,] <- c(paste0(ds_name, ': ', model_name), model_ls$rmse_test, MAE(model_ls$pred, model_ls$TestSet$SILA))
The regression coefficients are normalized so their absolute sum is 100 and the result is sorted.
# Top-coefficient plot for the PLS final model (no REACTOME features here,
# hence character(0) below).
top_coef_plot(model_ls$model_fit$finalModel, model_name, ds_name)
## Using ID as id variables
## Using ID as id variables
## character(0)
# ---- CyTOF frequencies: RF ----
model_ls <- model_ds$model_RF
model_name = 'RF'
cat('RMSE test: ', model_ls$rmse_test, '\n')
## RMSE test: 0.138978
cat('MAE test: ', MAE(model_ls$pred, model_ls$TestSet$SILA), '\n')
## MAE test: 0.1320495
model_ls$model_fit
## Random Forest
##
## 34 samples
## 39 predictors
##
## Pre-processing: centered (39), scaled (39)
## Resampling: Cross-Validated (5 fold, repeated 10 times)
## Summary of sample sizes: 28, 28, 27, 26, 27, 29, ...
## Resampling results across tuning parameters:
##
## mtry splitrule RMSE Rsquared MAE
## 2 variance 0.1753103 0.1700511 0.1477809
## 2 extratrees 0.1699220 0.1646694 0.1421207
## 11 variance 0.1768414 0.1525850 0.1497036
## 11 extratrees 0.1716088 0.1296286 0.1447778
## 20 variance 0.1768067 0.1442666 0.1492415
## 20 extratrees 0.1717616 0.1124466 0.1451755
## 29 variance 0.1770195 0.1438997 0.1496551
## 29 extratrees 0.1721370 0.1219529 0.1454077
## 39 variance 0.1769284 0.1383713 0.1494189
## 39 extratrees 0.1728073 0.1142058 0.1459092
##
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 2, splitrule = extratrees
## and min.node.size = 5.
plot(model_ls$model_fit)
err_df[2,] <- c(paste0(ds_name, ': ', model_name), model_ls$rmse_test, MAE(model_ls$pred, model_ls$TestSet$SILA))
# Variable importance + iml feature-effect plots for the RF model.
ftimp_gg(model_ls, model_name=model_name, ds_name=ds_name)
## Loading required package: iml
## Selecting by Overall
## Warning: UNRELIABLE VALUE: Future ('future_lapply-1') unexpectedly generated
## random numbers without specifying argument 'future.seed'. There is a risk that
## those random numbers are not statistically sound and the overall results might
## be invalid. To fix this, specify 'future.seed=TRUE'. This ensures that proper,
## parallel-safe random numbers are produced via the L'Ecuyer-CMRG method. To
## disable this check, use 'future.seed=NULL', or set option 'future.rng.onMisuse'
## to "ignore".
# ---- CyTOF bulk expression: PLS ----
ds_name = 'CyTOF bulk expression'
model_ds <- models_by_ds$cytof_medianprot
model_ls <- model_ds$model_PLS
model_name = 'PLS'
cat('RMSE test: ', model_ls$rmse_test, '\n')
## RMSE test: 0.1758155
cat('MAE test: ', MAE(model_ls$pred, model_ls$TestSet$SILA), '\n')
## MAE test: 0.1420185
model_ls$model_fit
## Partial Least Squares
##
## 34 samples
## 34 predictors
##
## Pre-processing: centered (34), scaled (34)
## Resampling: Cross-Validated (5 fold, repeated 10 times)
## Summary of sample sizes: 28, 28, 27, 26, 27, 29, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 0.1859154 0.1526097 0.1513773
## 2 0.2150547 0.2081635 0.1757809
## 3 0.2326452 0.2062358 0.1846154
## 4 0.2898711 0.2588294 0.2225834
## 5 0.3474757 0.2764505 0.2637017
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 1.
plot(model_ls$model_fit)
err_df[3,] <- c(paste0(ds_name, ': ', model_name), model_ls$rmse_test, MAE(model_ls$pred, model_ls$TestSet$SILA))
The regression coefficients are normalized so their absolute sum is 100 and the result is sorted.
# Top-coefficient plot for the bulk-expression PLS model.
top_coef_plot(model_ls$model_fit$finalModel, model_name, ds_name)
## Using ID as id variables
## Using ID as id variables
## character(0)
# ---- CyTOF bulk expression: RF ----
model_ls <- model_ds$model_RF
model_name = 'RF'
cat('RMSE test: ', model_ls$rmse_test, '\n')
## RMSE test: 0.1487374
cat('MAE test: ', MAE(model_ls$pred, model_ls$TestSet$SILA), '\n')
## MAE test: 0.134784
model_ls$model_fit
## Random Forest
##
## 34 samples
## 34 predictors
##
## Pre-processing: centered (34), scaled (34)
## Resampling: Cross-Validated (5 fold, repeated 10 times)
## Summary of sample sizes: 28, 28, 27, 26, 27, 29, ...
## Resampling results across tuning parameters:
##
## mtry splitrule RMSE Rsquared MAE
## 2 variance 0.1687446 0.1612905 0.1431078
## 2 extratrees 0.1678348 0.1246672 0.1419484
## 10 variance 0.1740050 0.1427438 0.1450840
## 10 extratrees 0.1712549 0.1224263 0.1444594
## 18 variance 0.1761271 0.1427009 0.1463892
## 18 extratrees 0.1719575 0.1240482 0.1446421
## 26 variance 0.1780171 0.1360151 0.1478463
## 26 extratrees 0.1728129 0.1434599 0.1454456
## 34 variance 0.1792265 0.1342647 0.1485463
## 34 extratrees 0.1737264 0.1351547 0.1461453
##
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 2, splitrule = extratrees
## and min.node.size = 5.
plot(model_ls$model_fit)
err_df[4,] <- c(paste0(ds_name, ': ', model_name), model_ls$rmse_test, MAE(model_ls$pred, model_ls$TestSet$SILA))
ftimp_gg(model_ls, model_name=model_name, ds_name=ds_name)
## Selecting by Overall
## Warning: UNRELIABLE VALUE: Future ('future_lapply-1') unexpectedly generated
## random numbers without specifying argument 'future.seed'. There is a risk that
## those random numbers are not statistically sound and the overall results might
## be invalid. To fix this, specify 'future.seed=TRUE'. This ensures that proper,
## parallel-safe random numbers are produced via the L'Ecuyer-CMRG method. To
## disable this check, use 'future.seed=NULL', or set option 'future.rng.onMisuse'
## to "ignore".
# ---- RNA Seq REACTOME scores: PLS ----
ds_name = 'RNA Seq REACTOME scores'
model_ds <- models_by_ds$rna_reactome
model_ls <- model_ds$model_PLS
model_name = 'PLS'
cat('RMSE test: ', model_ls$rmse_test, '\n')
## RMSE test: 0.1474013
cat('MAE test: ', MAE(model_ls$pred, model_ls$TestSet$SILA), '\n')
## MAE test: 0.1333214
model_ls$model_fit
## Partial Least Squares
##
## 34 samples
## 1838 predictors
##
## Pre-processing: centered (1838), scaled (1838)
## Resampling: Cross-Validated (5 fold, repeated 10 times)
## Summary of sample sizes: 28, 28, 27, 26, 27, 29, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 0.1739414 0.1421956 0.1405954
## 2 0.1803449 0.1893151 0.1425230
## 3 0.2004195 0.1374260 0.1614717
## 4 0.2023606 0.1386374 0.1609985
## 5 0.2015930 0.1528425 0.1600130
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 1.
plot(model_ls$model_fit)
err_df[5,] <- c(paste0(ds_name, ': ', model_name), model_ls$rmse_test, MAE(model_ls$pred, model_ls$TestSet$SILA))
The regression coefficients are normalized so their absolute sum is 100 and the result is sorted.
# Top-coefficient plot; reactome = TRUE maps pathway IDs to readable names.
top_coef_plot(model_ls$model_fit$finalModel, model_name, ds_name, reactome = TRUE, reactome_info = reactome_info)
## Using ID as id variables
## Using ID as id variables
## [1] "reactome_1693" "reactome_1527" "reactome_803" "reactome_558"
## [5] "reactome_1143" "reactome_1495" "reactome_1374" "reactome_979"
## [9] "reactome_1525" "reactome_1077"
# ---- RNA Seq REACTOME scores: RF ----
model_ls <- model_ds$model_RF
model_name = 'RF'
cat('RMSE test: ', model_ls$rmse_test, '\n')
## RMSE test: 0.1507297
cat('MAE test: ', MAE(model_ls$pred, model_ls$TestSet$SILA), '\n')
## MAE test: 0.1414097
model_ls$model_fit
## Random Forest
##
## 34 samples
## 1838 predictors
##
## Pre-processing: centered (1838), scaled (1838)
## Resampling: Cross-Validated (5 fold, repeated 10 times)
## Summary of sample sizes: 28, 28, 27, 26, 27, 29, ...
## Resampling results across tuning parameters:
##
## mtry splitrule RMSE Rsquared MAE
## 2 variance 0.1618905 0.1546546 0.1358869
## 2 extratrees 0.1605926 0.1699014 0.1339465
## 11 variance 0.1650863 0.1485923 0.1380268
## 11 extratrees 0.1626671 0.1505702 0.1352888
## 60 variance 0.1666801 0.1774619 0.1384627
## 60 extratrees 0.1646094 0.1670552 0.1372455
## 333 variance 0.1684221 0.1757368 0.1394101
## 333 extratrees 0.1662657 0.1799412 0.1384324
## 1838 variance 0.1711252 0.1676104 0.1410888
## 1838 extratrees 0.1669614 0.1802420 0.1383647
##
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 2, splitrule = extratrees
## and min.node.size = 5.
plot(model_ls$model_fit)
err_df[6,] <- c(paste0(ds_name, ': ', model_name), model_ls$rmse_test, MAE(model_ls$pred, model_ls$TestSet$SILA))
ftimp_gg(model_ls, model_name=model_name, ds_name=ds_name, reactome=TRUE, reactome_info=reactome_info)
## Selecting by Overall
## Warning: UNRELIABLE VALUE: Future ('future_lapply-1') unexpectedly generated
## random numbers without specifying argument 'future.seed'. There is a risk that
## those random numbers are not statistically sound and the overall results might
## be invalid. To fix this, specify 'future.seed=TRUE'. This ensures that proper,
## parallel-safe random numbers are produced via the L'Ecuyer-CMRG method. To
## disable this check, use 'future.seed=NULL', or set option 'future.rng.onMisuse'
## to "ignore".
# ---- RNA Seq VIPER TF activity: PLS ----
ds_name = 'RNA Seq VIPER TF activity'
model_ds <- models_by_ds$rna_viper4
model_ls <- model_ds$model_PLS
model_name = 'PLS'
cat('RMSE test: ', model_ls$rmse_test, '\n')
## RMSE test: 0.1463339
cat('MAE test: ', MAE(model_ls$pred, model_ls$TestSet$SILA), '\n')
## MAE test: 0.130622
model_ls$model_fit
## Partial Least Squares
##
## 34 samples
## 118 predictors
##
## Pre-processing: centered (118), scaled (118)
## Resampling: Cross-Validated (5 fold, repeated 10 times)
## Summary of sample sizes: 28, 28, 27, 26, 27, 29, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 0.1730487 0.1608107 0.1386264
## 2 0.2212056 0.1263251 0.1817183
## 3 0.2210563 0.1134532 0.1803047
## 4 0.2232691 0.1298120 0.1812632
## 5 0.2264314 0.1316729 0.1802890
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 1.
plot(model_ls$model_fit)
err_df[7,] <- c(paste0(ds_name, ': ', model_name), model_ls$rmse_test, MAE(model_ls$pred, model_ls$TestSet$SILA))
The regression coefficients are normalized so their absolute sum is 100 and the result is sorted.
# Top-coefficient plot for the VIPER PLS model.
top_coef_plot(model_ls$model_fit$finalModel, model_name, ds_name)
## Using ID as id variables
## Using ID as id variables
## character(0)
# ---- RNA Seq VIPER TF activity: RF ----
model_ls <- model_ds$model_RF
model_name = 'RF'
cat('RMSE test: ', model_ls$rmse_test, '\n')
## RMSE test: 0.1424572
cat('MAE test: ', MAE(model_ls$pred, model_ls$TestSet$SILA), '\n')
## MAE test: 0.1334272
model_ls$model_fit
## Random Forest
##
## 34 samples
## 118 predictors
##
## Pre-processing: centered (118), scaled (118)
## Resampling: Cross-Validated (5 fold, repeated 10 times)
## Summary of sample sizes: 28, 28, 27, 26, 27, 29, ...
## Resampling results across tuning parameters:
##
## mtry splitrule RMSE Rsquared MAE
## 2 variance 0.1640078 0.1543702 0.1354459
## 2 extratrees 0.1622847 0.1548528 0.1335238
## 31 variance 0.1657500 0.1768016 0.1354061
## 31 extratrees 0.1652939 0.1537369 0.1361517
## 60 variance 0.1656851 0.1703241 0.1350524
## 60 extratrees 0.1644074 0.1720150 0.1351572
## 89 variance 0.1652619 0.1926147 0.1349360
## 89 extratrees 0.1645177 0.1821421 0.1353334
## 118 variance 0.1651127 0.1891581 0.1342650
## 118 extratrees 0.1648454 0.1794311 0.1349291
##
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 2, splitrule = extratrees
## and min.node.size = 5.
plot(model_ls$model_fit)
err_df[8,] <- c(paste0(ds_name, ': ', model_name), model_ls$rmse_test, MAE(model_ls$pred, model_ls$TestSet$SILA))
ftimp_gg(model_ls, model_name=model_name, ds_name=ds_name)
## Selecting by Overall
## Warning: UNRELIABLE VALUE: Future ('future_lapply-1') unexpectedly generated
## random numbers without specifying argument 'future.seed'. There is a risk that
## those random numbers are not statistically sound and the overall results might
## be invalid. To fix this, specify 'future.seed=TRUE'. This ensures that proper,
## parallel-safe random numbers are produced via the L'Ecuyer-CMRG method. To
## disable this check, use 'future.seed=NULL', or set option 'future.rng.onMisuse'
## to "ignore".
# ---- WES mutation: PLS ----
ds_name = 'WES mutation'
model_ds <- models_by_ds$wes_binary
model_ls <- model_ds$model_PLS
model_name = 'PLS'
cat('RMSE test: ', model_ls$rmse_test, '\n')
## RMSE test: 0.154442
cat('MAE test: ', MAE(model_ls$pred, model_ls$TestSet$SILA), '\n')
## MAE test: 0.1396076
model_ls$model_fit
## Partial Least Squares
##
## 34 samples
## 169 predictors
##
## Pre-processing: centered (169), scaled (169)
## Resampling: Cross-Validated (5 fold, repeated 10 times)
## Summary of sample sizes: 28, 28, 27, 26, 27, 29, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 0.1907319 0.1875453 0.1558135
## 2 0.1825731 0.1722682 0.1476898
## 3 0.1881260 0.1785514 0.1509093
## 4 0.1949080 0.1641259 0.1555165
## 5 0.2018338 0.1496834 0.1600058
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 2.
plot(model_ls$model_fit)
err_df[9,] <- c(paste0(ds_name, ': ', model_name), model_ls$rmse_test, MAE(model_ls$pred, model_ls$TestSet$SILA))
The regression coefficients are normalized so their absolute sum is 100 and the result is sorted.
# Top-coefficient plot for the WES PLS model.
top_coef_plot(model_ls$model_fit$finalModel, model_name, ds_name)
## Using ID as id variables
## Using ID as id variables
## character(0)
# ---- WES mutation: RF ----
model_ls <- model_ds$model_RF
model_name = 'RF'
cat('RMSE test: ', model_ls$rmse_test, '\n')
## RMSE test: 0.1393409
cat('MAE test: ', MAE(model_ls$pred, model_ls$TestSet$SILA), '\n')
## MAE test: 0.1278576
model_ls$model_fit
## Random Forest
##
## 34 samples
## 169 predictors
##
## Pre-processing: centered (169), scaled (169)
## Resampling: Cross-Validated (5 fold, repeated 10 times)
## Summary of sample sizes: 28, 28, 27, 26, 27, 29, ...
## Resampling results across tuning parameters:
##
## mtry splitrule RMSE Rsquared MAE
## 2 variance 0.1659472 0.1558093 0.1398201
## 2 extratrees 0.1660705 0.1563622 0.1396717
## 43 variance 0.1710692 0.1540223 0.1468873
## 43 extratrees 0.1711343 0.1549991 0.1472228
## 85 variance 0.1741523 0.1647829 0.1509367
## 85 extratrees 0.1746352 0.1736047 0.1511754
## 127 variance 0.1771702 0.1751928 0.1538010
## 127 extratrees 0.1773455 0.1772299 0.1539032
## 169 variance 0.1796438 0.1772324 0.1555598
## 169 extratrees 0.1793553 0.1822027 0.1555469
##
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 2, splitrule = variance
## and min.node.size = 5.
plot(model_ls$model_fit)
err_df[10,] <- c(paste0(ds_name, ': ', model_name), model_ls$rmse_test, MAE(model_ls$pred, model_ls$TestSet$SILA))
ftimp_gg(model_ls, model_name=model_name, ds_name=ds_name)
## Selecting by Overall
## Warning: UNRELIABLE VALUE: Future ('future_lapply-1') unexpectedly generated
## random numbers without specifying argument 'future.seed'. There is a risk that
## those random numbers are not statistically sound and the overall results might
## be invalid. To fix this, specify 'future.seed=TRUE'. This ensures that proper,
## parallel-safe random numbers are produced via the L'Ecuyer-CMRG method. To
## disable this check, use 'future.seed=NULL', or set option 'future.rng.onMisuse'
## to "ignore".
# ---- Concatenated data: PLS ----
ds_name = 'Concatenated Data'
# NOTE(review): absolute, user-specific path — breaks on other machines.
load('/Users/senosam/Documents/Massion_lab/data_integration/models/models_res/models_CONCAT_ls.RData')
model_ls <- models_CONCAT_ls$model_PLS
model_name = 'PLS'
cat('RMSE test: ', model_ls$rmse_test, '\n')
## RMSE test: 0.1476102
cat('MAE test: ', MAE(model_ls$pred, model_ls$TestSet$SILA), '\n')
## MAE test: 0.1328039
model_ls$model_fit
## Partial Least Squares
##
## 34 samples
## 2198 predictors
##
## Pre-processing: centered (2198), scaled (2198)
## Resampling: Cross-Validated (5 fold, repeated 10 times)
## Summary of sample sizes: 28, 28, 27, 26, 27, 29, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 0.1761748 0.1328592 0.1422889
## 2 0.1836616 0.1718236 0.1448280
## 3 0.1996644 0.1216580 0.1592344
## 4 0.1967117 0.1226756 0.1540897
## 5 0.1937543 0.1253861 0.1502014
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 1.
plot(model_ls$model_fit)
err_df[11,] <- c(paste0(ds_name, ': ', model_name), model_ls$rmse_test, MAE(model_ls$pred, model_ls$TestSet$SILA))
The regression coefficients are normalized so their absolute sum is 100 and the result is sorted.
# , reactome = TRUE, reactome_info = reactome_info
# Top-coefficient plot; reactome = TRUE maps pathway IDs to readable names.
top_coef_plot(model_ls$model_fit$finalModel, model_name, ds_name, reactome = TRUE, reactome_info = reactome_info)
## Using ID as id variables
## Using ID as id variables
## [1] "rna_reactome_1693" "rna_reactome_1527" "rna_reactome_803"
## [4] "rna_reactome_1495" "rna_reactome_1374" "rna_reactome_979"
## [7] "rna_reactome_1525" "rna_reactome_1077"
# ---- Concatenated data: RF ----
model_ls <- models_CONCAT_ls$model_RF
model_name = 'RF'
cat('RMSE test: ', model_ls$rmse_test, '\n')
## RMSE test: 0.1468419
cat('MAE test: ', MAE(model_ls$pred, model_ls$TestSet$SILA), '\n')
## MAE test: 0.1382979
model_ls$model_fit
## Random Forest
##
## 34 samples
## 2198 predictors
##
## Pre-processing: centered (2198), scaled (2198)
## Resampling: Cross-Validated (5 fold, repeated 10 times)
## Summary of sample sizes: 28, 28, 27, 26, 27, 29, ...
## Resampling results across tuning parameters:
##
## mtry splitrule RMSE Rsquared MAE
## 2 variance 0.1629314 0.1360342 0.1362809
## 2 extratrees 0.1615130 0.1683173 0.1345702
## 11 variance 0.1646622 0.1563520 0.1378350
## 11 extratrees 0.1632687 0.1556167 0.1362748
## 66 variance 0.1662447 0.1686240 0.1381101
## 66 extratrees 0.1646850 0.1479140 0.1373843
## 381 variance 0.1688350 0.1588761 0.1394662
## 381 extratrees 0.1655816 0.1636630 0.1367020
## 2198 variance 0.1721210 0.1484698 0.1419690
## 2198 extratrees 0.1674119 0.1672822 0.1382225
##
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 2, splitrule = extratrees
## and min.node.size = 5.
plot(model_ls$model_fit)
err_df[12,] <- c(paste0(ds_name, ': ', model_name), model_ls$rmse_test, MAE(model_ls$pred, model_ls$TestSet$SILA))
ftimp_gg(model_ls, model_name=model_name, ds_name=ds_name, reactome=TRUE, reactome_info=reactome_info)
## Selecting by Overall
## Warning: UNRELIABLE VALUE: Future ('future_lapply-1') unexpectedly generated
## random numbers without specifying argument 'future.seed'. There is a risk that
## those random numbers are not statistically sound and the overall results might
## be invalid. To fix this, specify 'future.seed=TRUE'. This ensures that proper,
## parallel-safe random numbers are produced via the L'Ecuyer-CMRG method. To
## disable this check, use 'future.seed=NULL', or set option 'future.rng.onMisuse'
## to "ignore".
# Percent prediction-vs-observed differences per dataset/model.
# NOTE(review): these fd_* objects are not used in this visible chunk —
# presumably consumed further down the document; confirm before removing.
fd_cytoffreq_RF <- frac_dif(models_by_ds$cytof_freq$model_RF)
fd_cytoffreq_PLS <- frac_dif(models_by_ds$cytof_freq$model_PLS)
fd_cytofmed_RF <- frac_dif(models_by_ds$cytof_medianprot$model_RF)
fd_cytofmed_PLS <- frac_dif(models_by_ds$cytof_medianprot$model_PLS)
fd_rnareac_RF <- frac_dif(models_by_ds$rna_reactome$model_RF)
fd_rnareac_PLS <- frac_dif(models_by_ds$rna_reactome$model_PLS)
fd_rnaviper_RF <- frac_dif(models_by_ds$rna_viper4$model_RF)
fd_rnaviper_PLS <- frac_dif(models_by_ds$rna_viper4$model_PLS)
fd_wes_RF <- frac_dif(models_by_ds$wes_binary$model_RF)
fd_wes_PLS <- frac_dif(models_by_ds$wes_binary$model_PLS)
fd_cc_RF <- frac_dif(models_CONCAT_ls$model_RF)
fd_cc_PLS <- frac_dif(models_CONCAT_ls$model_PLS)
## Using as id variables
# Convert RMSE/MAE back to numeric (row assignment via c() coerced them to
# character) and round for the summary table.
err_df$MAE <- as.numeric(err_df$MAE)
err_df$RMSE <- as.numeric(err_df$RMSE)
err_df$MAE <- round(err_df$MAE, digits = 3)
err_df$RMSE <- round(err_df$RMSE, digits = 3)
knitr::kable(err_df)
| Model | RMSE | MAE |
|---|---|---|
| CyTOF frequencies: PLS | 0.147 | 0.139 |
| CyTOF frequencies: RF | 0.139 | 0.132 |
| CyTOF bulk expression: PLS | 0.176 | 0.142 |
| CyTOF bulk expression: RF | 0.149 | 0.135 |
| RNA Seq REACTOME scores: PLS | 0.147 | 0.133 |
| RNA Seq REACTOME scores: RF | 0.151 | 0.141 |
| RNA Seq VIPER TF activity: PLS | 0.146 | 0.131 |
| RNA Seq VIPER TF activity: RF | 0.142 | 0.133 |
| WES mutation: PLS | 0.154 | 0.140 |
| WES mutation: RF | 0.139 | 0.128 |
| Concatenated Data: PLS | 0.148 | 0.133 |
| Concatenated Data: RF | 0.147 | 0.138 |
# ---- Ensemble (majority vote) of per-dataset RF predictions ----
# Ground-truth class labels for the held-out test samples.
test_true <- traintest_info[which(traintest_info$ds=='test'), 'n_op2']
# One column of discretized predictions per dataset.
res <- matrix(NA, nrow = length(which(traintest_info$ds=='test')), ncol = length(names(models_by_ds)))
colnames(res) <- names(models_by_ds)
model_nm = 'model_RF'
for(ds in names(models_by_ds)){
  x <- models_by_ds[[ds]][[model_nm]]$pred
  # Discretize SILA predictions: indolent (< 0.4), intermediate
  # ([0.4, 0.6]), aggressive (> 0.6). Vectorized; also fixes the original
  # boundary bug where a prediction of exactly 0.4 fell through to 'agg'.
  res[, ds] <- ifelse(x < 0.4, 'ind', ifelse(x <= 0.6, 'int', 'agg'))
}
# Majority vote across datasets for each test sample (ties resolved by
# which.max: first class in table order wins). Preallocated instead of
# growing with c().
pred_maj <- character(nrow(res))
for(i in seq_len(nrow(res))){
  pred_maj[i] <- names(which.max(table(res[i,])))
}
confusionMatrix(as.factor(pred_maj), as.factor(test_true))
## [,1] [,2]
## [1,] 0 0
## [2,] 0 6
# ---- Ensemble (majority vote) of per-dataset PLS predictions ----
res <- matrix(NA, nrow = length(which(traintest_info$ds=='test')), ncol = length(names(models_by_ds)))
colnames(res) <- names(models_by_ds)
model_nm = 'model_PLS'
for(ds in names(models_by_ds)){
  x <- models_by_ds[[ds]][[model_nm]]$pred
  # Discretize SILA predictions: indolent (< 0.4), intermediate
  # ([0.4, 0.6]), aggressive (> 0.6). Vectorized; also fixes the original
  # boundary bug where a prediction of exactly 0.4 fell through to 'agg'.
  res[, ds] <- ifelse(x < 0.4, 'ind', ifelse(x <= 0.6, 'int', 'agg'))
}
# Majority vote across datasets for each test sample (ties resolved by
# which.max: first class in table order wins).
pred_maj <- character(nrow(res))
for(i in seq_len(nrow(res))){
  pred_maj[i] <- names(which.max(table(res[i,])))
}
confusionMatrix(as.factor(pred_maj), as.factor(test_true))
## [,1] [,2]
## [1,] 0 0
## [2,] 0 7
# ---- Concatenation approach: RF classification ----
x <- models_CONCAT_ls$model_RF$pred
# Discretize predictions (indolent < 0.4, intermediate [0.4, 0.6],
# aggressive > 0.6). Vectorized instead of growing res with c(); also fixes
# the original boundary bug where exactly 0.4 was classified as 'agg'.
res <- ifelse(x < 0.4, 'ind', ifelse(x <= 0.6, 'int', 'agg'))
confusionMatrix(as.factor(res), as.factor(test_true))
## [,1] [,2]
## [1,] 0 0
## [2,] 0 4
# ---- Concatenation approach: PLS classification ----
x <- models_CONCAT_ls$model_PLS$pred
# Discretize predictions (indolent < 0.4, intermediate [0.4, 0.6],
# aggressive > 0.6). Vectorized instead of growing res with c(); also fixes
# the original boundary bug where exactly 0.4 was classified as 'agg'.
res <- ifelse(x < 0.4, 'ind', ifelse(x <= 0.6, 'int', 'agg'))
confusionMatrix(as.factor(res), as.factor(test_true))
## [,1] [,2]
## [1,] 0 0
## [2,] 0 5
# Manually transcribed classification metrics for ensemble vs concatenation.
# NOTE(review): 'NVP' is presumably a typo for 'NPV' (negative predictive
# value) — confirm before renaming, since the name appears in the facets.
mmetrics <- data.frame(Approach=c('Ensemble', 'Ensemble', 'Concatenation', 'Concatenation'),
Algorithm=c('RF', 'PLS', 'RF', 'PLS'),
Accuracy=c(0.7273,0.8182,0.5455,0.6364),
Sensitivity=c(0.6667,0.7778,0.4444,0.5556),
NVP=c(0.4,0.5,0.2857,0.3333))
# Long format: one row per (Approach, Algorithm, metric).
mmetrics <- reshape2::melt(mmetrics)
## Using Approach, Algorithm as id variables
# NOTE(review): 'sbst' is computed but never used below in this visible
# chunk (the plot uses the full 'mmetrics').
sbst <- mmetrics[mmetrics$variable== 'Accuracy',]
# Grouped bar chart of each metric, faceted by metric name.
ggplot(mmetrics, aes(fill=Approach, y=value, x=Algorithm)) +
geom_bar(position="dodge", stat="identity") +
theme_minimal() +
ylim(c(0,1)) +
labs(title=NULL,x=NULL, y =NULL) +
scale_fill_brewer(palette="Dark2")+
facet_wrap(~variable, scales='free') +
theme(strip.text.x = element_text(size = 18),
axis.text.x =element_text(size = 15),
axis.text.y =element_text(size = 12))